{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.base import BaseEstimator, ClassifierMixin\n",
    "from sklearn.utils.validation import check_X_y, check_is_fitted\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from scipy import sparse\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.feature_extraction.text import HashingVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn import metrics\n",
    "\n",
    "class NB_SVM(BaseEstimator, ClassifierMixin):\n",
    "    def __init__(self, C=1.0, dual=False, n_jobs=1):\n",
    "        self.C = C\n",
    "        self.dual = dual\n",
    "        self.n_jobs = n_jobs\n",
    "\n",
    "    def predict(self, x):\n",
    "        # Verify that model has been fit\n",
    "        check_is_fitted(self, ['_r', '_clf'])\n",
    "        return self._clf.predict(x.multiply(self._r))\n",
    "\n",
    "    def predict_proba(self, x):\n",
    "        # Verify that model has been fit\n",
    "        check_is_fitted(self, ['_r', '_clf'])\n",
    "        return self._clf.predict_proba(x.multiply(self._r))\n",
    "\n",
    "    def fit(self, x, y):\n",
    "        # Check that X and y have correct shape\n",
    "        x, y = check_X_y(x, y, accept_sparse=True)\n",
    "\n",
    "        def pr(x, y_i, y):\n",
    "            p = x[y==y_i].sum(0)\n",
    "            return (p+1) / ((y==y_i).sum()+1)\n",
    "\n",
    "        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))\n",
    "        x_nb = x.multiply(self._r)\n",
    "        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sklearn.datasets\n",
    "import re\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clearstring(string):\n",
    "    string = re.sub('[^\\'\\\"A-Za-z0-9 ]+', '', string)\n",
    "    string = string.split(' ')\n",
    "    string = filter(None, string)\n",
    "    string = [y.strip() for y in string]\n",
    "    string = ' '.join(string)\n",
    "    return string\n",
    "\n",
    "def separate_dataset(trainset):\n",
    "    datastring = []\n",
    "    datatarget = []\n",
    "    for i in range(len(trainset.data)):\n",
    "        data_ = trainset.data[i].split('\\n')\n",
    "        data_ = list(filter(None, data_))\n",
    "        for n in range(len(data_)):\n",
    "            data_[n] = clearstring(data_[n])\n",
    "        datastring += data_\n",
    "        for n in range(len(data_)):\n",
    "            datatarget.append(trainset.target[i])\n",
    "    return datastring, datatarget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']\n",
      "416809\n",
      "416809\n"
     ]
    }
   ],
   "source": [
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset)\n",
    "print (trainset.target_names)\n",
    "print (len(trainset.data))\n",
    "print (len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/logistic.py:1228: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = -1.\n",
      "  \" = {}.\".format(self.n_jobs))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8491267632664812\n",
      "0.8479992323193551\n",
      "0.8436735281416439\n",
      "0.8561009548486157\n",
      "0.8522828147117392\n",
      "0.8478203497996689\n",
      "0.8455171421031166\n",
      "0.8435865444599069\n",
      "0.8466097221555737\n",
      "0.8471855655261769\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "cv = StratifiedKFold(n_splits=10, shuffle=True)\n",
    "X_raw, y_raw = np.array(trainset.data), np.array(trainset.target)\n",
    "\n",
    "aucs = []\n",
    "for train, test in cv.split(X_raw, y_raw):\n",
    "    y_train = y_raw[train]\n",
    "    X_train = X_raw[train]\n",
    "\n",
    "    y_test = y_raw[test]\n",
    "    X_test = X_raw[test]\n",
    "    \n",
    "    tfidf = TfidfVectorizer().fit(X_train)\n",
    "    X_train_tr = tfidf.transform(X_train)\n",
    "    X_test_tr = tfidf.transform(X_test)\n",
    "    \n",
    "    model = NB_SVM(C=1, dual=True, n_jobs=-1).fit(X_train_tr, y_train)\n",
    "    y_preds = model.predict(X_test_tr)\n",
    "    accuracy = accuracy_score(y_test, y_preds)\n",
    "    aucs.append(accuracy)\n",
    "    print(accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/feature_extraction/hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
      "  \" in version 0.21.\", DeprecationWarning)\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/feature_extraction/hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
      "  \" in version 0.21.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "# bag-of-word\n",
    "bow = CountVectorizer().fit_transform(trainset.data)\n",
    "\n",
    "#tf-idf, must get from BOW first\n",
    "tfidf = TfidfTransformer().fit_transform(bow)\n",
    "\n",
    "#hashing, default n_features, probability cannot divide by negative\n",
    "hashing = HashingVectorizer(non_negative = True).fit_transform(trainset.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/logistic.py:1228: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = -1.\n",
      "  \" = {}.\".format(self.n_jobs))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.88038914613373\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.89      0.90      0.90     11438\n",
      "       fear       0.84      0.84      0.84      9545\n",
      "        joy       0.89      0.92      0.91     28253\n",
      "       love       0.79      0.72      0.75      6951\n",
      "    sadness       0.92      0.91      0.92     24218\n",
      "   surprise       0.71      0.71      0.71      2957\n",
      "\n",
      "avg / total       0.88      0.88      0.88     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)\n",
    "\n",
    "model = NB_SVM(C=1, dual=True, n_jobs=-1).fit(train_X, train_Y)\n",
    "\n",
    "predicted = model.predict(test_X)\n",
    "\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/logistic.py:1228: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = -1.\n",
      "  \" = {}.\".format(self.n_jobs))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.8463568532424846\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.91      0.89      0.90     11364\n",
      "       fear       0.86      0.82      0.84      9455\n",
      "        joy       0.80      0.94      0.86     28262\n",
      "       love       0.87      0.47      0.61      6880\n",
      "    sadness       0.89      0.86      0.88     24376\n",
      "   surprise       0.72      0.59      0.65      3025\n",
      "\n",
      "avg / total       0.85      0.85      0.84     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(tfidf, trainset.target, test_size = 0.2)\n",
    "\n",
    "model = NB_SVM(C=1, dual=True, n_jobs=-1).fit(train_X, train_Y)\n",
    "\n",
    "predicted = model.predict(test_X)\n",
    "\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/logistic.py:1228: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = -1.\n",
      "  \" = {}.\".format(self.n_jobs))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.8227009908591445\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.90      0.89      0.90     11454\n",
      "       fear       0.87      0.83      0.85      9515\n",
      "        joy       0.76      0.94      0.84     28277\n",
      "       love       0.88      0.38      0.53      6949\n",
      "    sadness       0.87      0.83      0.84     24221\n",
      "   surprise       0.70      0.47      0.56      2946\n",
      "\n",
      "avg / total       0.83      0.82      0.81     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(hashing, trainset.target, test_size = 0.2)\n",
    "\n",
    "model = NB_SVM(C=1, dual=True, n_jobs=-1).fit(train_X, train_Y)\n",
    "\n",
    "predicted = model.predict(test_X)\n",
    "\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
